library(tidyverse)
library(here)
library(rio)
library(janitor)
library(stringr)
library(colorblindr)
library(glue)
We are using data from the Fragile Families & Child Well-Being Study (Princeton) for this project. The study is longitudinal, with data collected from child participants, their parents/guardians, and their teachers at multiple time points (e.g., Baseline, Year 9, Year 15). More information about the study can be found at the following links:
Our original research questions were:
After obtaining the dataset and beginning to work with it, we did not move forward with RQ2. In addition, we decided to explore the relationship between internalizing and externalizing behaviors at age 9 and the number of suspensions/expulsions at age 15.
The complete data file has more than 17,000 variables. We’ll use the meta-data file, which is like a data dictionary (lists all variables and associated information, such as when it was collected and an explanation of what the variable is), to select the variables of interest, and use the ff_vars vector to subset the actual data file.
For this project, we’re interested in:
#importing meta-data
ff_meta <- import(here("data","FFMetadata_v09.csv")) %>%
clean_names() %>%
as_tibble()
#select scales
scales <- c("Aggravation in Parents",
"Self-Description Questionnaire (SDQ)",
"Delinquent Behavior")
#further narrow the meta-data file to variables of interest
ff_meta_subset <- ff_meta %>%
#demographic characteristics - race/ethnicity
filter((topics == "Demographics" &
str_detect(varlab,
"self-description of race/ethnicity")) |
#gender
(subtopics == "sex/gender" &
str_detect(varlab, "Focal baby's gender")) |
#scales
scale %in% scales|
#outcomes from questionnaire
(source == "questionnaire" &
subtopics == "student experiences" &
str_detect(varlab,"suspen"))) %>%
#only include relevant columns
select(new_name,
varlab,
topics,
subtopics,
type,
scale,
source,
respondent,
survey,
wave,
starts_with("label"))
Our meta-data subset includes the following information:
#use variable names to subset the dataset
ff_vars <- ff_meta_subset$new_name
#Import the full, original dataset. Note this will not work for peer reviewers. We are only sharing the narrowed dataset on github.
#ff <- import(here("data","FF_allwaves_2020v2_SPSS.sav")) %>%
# clean_names()
# ff_sub <- ff %>%
# select(idnum, #identifier
# all_of(ff_vars)) %>% #our selected variables
# as_tibble()
#let's save the subsetted df so that we don't have to clean each time
#export(ff_sub, here("data","ff_sub.Rda"))
ff_sub_orig <- import(here("data","ff_sub.Rda"))
#AW: I appended "_orig" here to retain an original version of the df
The variables (columns) have the following attributes:
#let's join the meta data to the subset so that we know which scale it comes from
ff_sub_long <- ff_sub_orig %>%
pivot_longer(
cols = -1,
names_to = "response"
) %>%
left_join(ff_meta, by = c("response" = "new_name"))
nest_by()#Let's make some example plots
plots <- ff_sub_long %>%
filter(scale == "Self-Description Questionnaire (SDQ)" & value >= 0) %>%
nest_by(varlab) %>%
summarise(
plot = list(
ggplot(data, aes(x = as.factor(value))) +
geom_bar() +
labs(
title = glue::glue("{varlab}")
)
)
)
walk(plots$plot, print)
#Calculates subscores for internalizing and externalizing behaviors at age 9
ff_sub <- ff_sub_orig %>%
mutate(int_scores = (k5g2a + k5g2c + k5g2e + k5g2g +
k5g2i + k5g2j + k5g2k + k5g2l) / 8,
ext_scores = (k5g2b + k5g2d + k5g2f +
k5g2h + k5g2m + k5g2n) / 6) %>%
filter(int_scores >= 0, ext_scores >= 0)
#AW: I think that scores lower than 0 should be filtered out first. Filtering out after calculating the mean will make some participant's scores appear lower than they should be because any negative-coded values are being subtracted from other responses. E.g., let's say a participant had -3, -3, 3, 3, 3, 3. The negative responses are getting subtracted from the sum before dividing. After re-reading the instructions for calculating the subscale scores, I don't think the participant should get a subscale score if they have any negatively coded items. "When a participant responds with don’t know, refuse, or missing, to any item on a given scale, their scale score will be missing..."
#AW: I was thinking something like the code below could work but realized I don't think we want to filter/subset this way. There will be some kids with int. scores but not ext. scores and vice versa. We could (a) create separate dataframes for participants with externalizing subscale scores and participants with internalizing subscale scores or (b) decide we want to restrict the sample to only include participants that had valid scores for all scales. I might also be overthinking this.
ff_sub2 <- ff_sub_orig %>%
filter(k5g2b >= 0 &
k5g2d >= 0 &
k5g2f >= 0 &
k5g2h >= 0 &
k5g2m >= 0 &
k5g2n >= 0
) %>%
rowwise() %>%
mutate(sdq_externalizing = mean(c(k5g2b,
k5g2d,
k5g2f,
k5g2h,
k5g2m,
k5g2n)
)
) %>%
ungroup()
#AW: I replicated the externalizing x suspensions/expulsions plot using the # of times expelled/suspended as reported by parents (p6c22), filtering to remove negative values first
p <- ff_sub2 %>%
filter(p6c22 >= 0) %>%
ggplot(aes(sdq_externalizing, p6c22))
p + geom_smooth(method = "lm",
color = "magenta") +
geom_smooth()
ff_sub_lm <- ff_sub %>%
rowwise() %>%
select(idnum,
starts_with("k6d6"),
starts_with("k5f1"),
int_scores,
ext_scores,
cm1bsex,
ck6ethrace,
p5l12g,
p6c22) %>% #AW: If you look at the ff_meta_subset df, p6c21 is a binary yes/no variable (C21. Youth ever been suspended/expelled?). I think we want p6c22 (# times reported by parent) or k6b30 (# times reported by student) instead. I changed this and subsequent instances to p6c22
filter(rowSums(across(where(is.numeric))) >= 0 &
ck6ethrace >= 0 &
p5l12g >= 0 &
p6c22 >= 0) %>%
mutate(del_beh_9 = sum(c_across(starts_with("k5f1"))), # binary yes/no
del_beh_15 = sum(c_across(starts_with("k6d6"))),#AW: I think this should be k6d61 (responses 1-4, with 4 being highest - see ff_meta_subset file) because the k6d62 variables are from the peer delinquency portion of the scale and have different responses (1, 2, 3), where lower scores correspond to greater delinquent behaviors (1 = often, 3 = never). I'm not sure negative responses are getting filtered out since there's a -8 in the resulting column
del_beh_15_self_rep = sum(c_across(starts_with("k6d61")))) #I added a column here using k6d61 for comparison
#trying out filtering zeroes out first and then joining with ff_sub_lm to compare
ff_sub_lm2 <- ff_sub %>%
filter(k6d61a > 0 &
k6d61b > 0 &
k6d61c > 0 &
k6d61d > 0 &
k6d61e > 0 &
k6d61f > 0 &
k6d61g > 0 &
k6d61h > 0 &
k6d61i > 0 &
k6d61j > 0 &
k6d61k > 0 &
k6d61l > 0 &
k6d61m > 0) %>%
rowwise() %>%
mutate(del_beh_15_self_rep2 = sum(c_across(starts_with("k6d61")))) %>%
select(idnum, del_beh_15_self_rep2)
ff_sub_lm3 <- left_join(ff_sub_lm, ff_sub_lm2)
means_df <- function(df, ...) {
means <- map(df, mean, ...)
nas <- map_lgl(means, is.na)
means_l <- means[!nas]
as.data.frame(means_l)
}
means_df(ff_sub)
## cm1bsex k5f1a k5f1b k5f1c k5f1d k5f1e k5f1f k5f1g
## 1 1.483629 1.848904 1.88435 1.908081 1.926705 1.689396 1.945329 1.919796
## k5f1h k5f1i k5f1j k5f1k k5f1l k5f1m k5f1n k5f1o
## 1 1.974166 1.975068 1.953139 1.990388 1.985882 1.816161 1.963352 1.981376
## k5f1p k5f1q k5g2a k5g2b k5g2c k5g2d k5g2e k5g2f
## 1 1.929709 1.943827 1.048363 0.8269751 1.528687 0.9008711 0.7065185 1.219585
## k5g2g k5g2h k5g2i k5g2j k5g2k k5g2l k5g2m k5g2n
## 1 0.6996095 0.7894263 1.455392 1.438871 0.8918594 1.42295 1.082908 0.6800841
## p5l12g p6c21 p6c22 ck6ethrace k6b29 k6b30 k6d61a
## 1 0.6794833 0.9933914 -3.956143 1.101532 0.7026134 -4.028237 0.1685191
## k6d61b k6d61c k6d61d k6d61e k6d61f k6d61g k6d61h
## 1 0.1952538 0.2258937 0.4466807 0.2427155 0.1628117 0.1559027 0.1432863
## k6d61i k6d61j k6d61k k6d61l k6d61m k6d62a k6d62b k6d62c
## 1 0.1459898 0.1595074 0.2349054 0.2847702 0.4629018 1.63833 1.497146 1.407029
## k6d62d k6d62e k6d62f k6d62g k6d62h k6d62i k6d62j k6d62k
## 1 1.721538 1.796636 1.880144 1.748573 1.739862 1.831481 1.735957 1.639231
## int_scores ext_scores
## 1 1.149031 0.9166416
ff_sub_lm$idnum <- as.numeric(ff_sub_lm$idnum)
ff_sub_lm$ck6ethrace <- as.numeric(ff_sub_lm$ck6ethrace)
lm_mods <- function (vardep, varindep1, varindep2, varindep3, DATA) {
summary(lm(paste(vardep, "~",
varindep1, "+",
varindep2, "+",
varindep3),
data = DATA))
}
lm_mods("del_beh_15", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.470 -0.928 1.403 2.656 16.614
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 44.8834 1.0911 41.136 <2e-16 ***
## ext_scores 0.2531 0.3175 0.797 0.426
## cm1bsex 0.3527 0.5022 0.702 0.483
## ck6ethrace -0.5726 0.2916 -1.964 0.050 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.326 on 671 degrees of freedom
## Multiple R-squared: 0.007143, Adjusted R-squared: 0.002704
## F-statistic: 1.609 on 3 and 671 DF, p-value: 0.186
lm_mods("del_beh_15", "int_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -52.252 -0.939 1.402 2.636 16.410
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 45.5536 1.0589 43.018 <2e-16 ***
## int_scores -0.2252 0.3332 -0.676 0.499
## cm1bsex 0.2896 0.4966 0.583 0.560
## ck6ethrace -0.5705 0.2918 -1.955 0.051 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 6.327 on 671 degrees of freedom
## Multiple R-squared: 0.006878, Adjusted R-squared: 0.002438
## F-statistic: 1.549 on 3 and 671 DF, p-value: 0.2006
lm_mods("del_beh_9", "int_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65.193 -0.807 0.655 1.555 3.340
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.11093 0.55734 55.821 < 2e-16 ***
## int_scores -0.60890 0.17539 -3.472 0.000550 ***
## cm1bsex 1.01349 0.26138 3.877 0.000116 ***
## ck6ethrace 0.02879 0.15357 0.187 0.851349
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.33 on 671 degrees of freedom
## Multiple R-squared: 0.03924, Adjusted R-squared: 0.03495
## F-statistic: 9.135 on 3 and 671 DF, p-value: 6.235e-06
lm_mods("del_beh_9", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -64.839 -0.650 0.453 1.314 3.805
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 32.5042 0.5493 59.178 < 2e-16 ***
## ext_scores -1.3917 0.1598 -8.708 < 2e-16 ***
## cm1bsex 0.6942 0.2528 2.746 0.00619 **
## ck6ethrace -0.0201 0.1468 -0.137 0.89113
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.185 on 671 degrees of freedom
## Multiple R-squared: 0.1213, Adjusted R-squared: 0.1174
## F-statistic: 30.87 on 3 and 671 DF, p-value: < 2.2e-16
lm_mods("p5l12g", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.95410 0.07816 0.14878 0.20082 0.31582
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.78997 0.06461 27.705 < 2e-16 ***
## ext_scores -0.06245 0.01880 -3.322 0.000943 ***
## cm1bsex 0.06021 0.02974 2.025 0.043263 *
## ck6ethrace 0.01093 0.01727 0.633 0.527141
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3746 on 671 degrees of freedom
## Multiple R-squared: 0.02639, Adjusted R-squared: 0.02204
## F-statistic: 6.062 on 3 and 671 DF, p-value: 0.0004487
lm_mods("p5l12g", "int_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9149 0.1050 0.1603 0.1979 0.2609
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.73262 0.06309 27.462 <2e-16 ***
## int_scores -0.03149 0.01985 -1.586 0.1132
## cm1bsex 0.07448 0.02959 2.517 0.0121 *
## ck6ethrace 0.01326 0.01738 0.763 0.4460
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.377 on 671 degrees of freedom
## Multiple R-squared: 0.01408, Adjusted R-squared: 0.009668
## F-statistic: 3.193 on 3 and 671 DF, p-value: 0.02313
lm_mods("p6c22", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.300 -1.604 -1.047 0.412 37.581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.95271 0.59566 3.278 0.0011 **
## ext_scores 0.42653 0.17332 2.461 0.0141 *
## cm1bsex -0.01671 0.27415 -0.061 0.9514
## ck6ethrace 0.02815 0.15920 0.177 0.8597
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.454 on 671 degrees of freedom
## Multiple R-squared: 0.009235, Adjusted R-squared: 0.004805
## F-statistic: 2.085 on 3 and 671 DF, p-value: 0.1009
lm_mods("p6c22", "ext_scores", "cm1bsex", "ck6ethrace", ff_sub_lm)
##
## Call:
## lm(formula = paste(vardep, "~", varindep1, "+", varindep2, "+",
## varindep3), data = DATA)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.300 -1.604 -1.047 0.412 37.581
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.95271 0.59566 3.278 0.0011 **
## ext_scores 0.42653 0.17332 2.461 0.0141 *
## cm1bsex -0.01671 0.27415 -0.061 0.9514
## ck6ethrace 0.02815 0.15920 0.177 0.8597
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.454 on 671 degrees of freedom
## Multiple R-squared: 0.009235, Adjusted R-squared: 0.004805
## F-statistic: 2.085 on 3 and 671 DF, p-value: 0.1009
mod_db_int_15 <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(del_beh_15 ~ int_scores + cm1bsex + ck6ethrace, data = .x)
)
)
mod_db_ext_15 <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(del_beh_15 ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
)
)
mod_db_int_9 <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(del_beh_9 ~ int_scores + cm1bsex + ck6ethrace, data = .x)
)
)
mod_db_ext_9 <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(del_beh_9 ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
)
)
mod_expulsion9_int <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(p5l12g ~ int_scores + cm1bsex + ck6ethrace, data = .x)
)
)
mod_expulsion15_ext <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(p6c22 ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
)
)
mod_expulsion9_ext <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(p5l12g ~ ext_scores + cm1bsex + ck6ethrace, data = .x)
)
)
mod_expulsion15_int <- ff_sub_lm %>%
group_by(idnum) %>%
nest() %>%
mutate(
model = map(
data, ~lm(p6c22 ~ int_scores + cm1bsex + ck6ethrace, data = .x)
)
)
pull_coef <- function(model, coef_name) {
coef(model)[coef_name]
}
mod_db_ext_15 %>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $ext_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 47 NA NA
## 2 14 <tibble [1 × 50]> <lm> -8 NA NA
## 3 19 <tibble [1 × 50]> <lm> 44 NA NA
## 4 25 <tibble [1 × 50]> <lm> 41 NA NA
## 5 28 <tibble [1 × 50]> <lm> 46 NA NA
## 6 30 <tibble [1 × 50]> <lm> 46 NA NA
## 7 33 <tibble [1 × 50]> <lm> 48 NA NA
## 8 40 <tibble [1 × 50]> <lm> 47 NA NA
## 9 48 <tibble [1 × 50]> <lm> 46 NA NA
## 10 52 <tibble [1 × 50]> <lm> 44 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_db_int_15 %>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $int_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 47 NA NA
## 2 14 <tibble [1 × 50]> <lm> -8 NA NA
## 3 19 <tibble [1 × 50]> <lm> 44 NA NA
## 4 25 <tibble [1 × 50]> <lm> 41 NA NA
## 5 28 <tibble [1 × 50]> <lm> 46 NA NA
## 6 30 <tibble [1 × 50]> <lm> 46 NA NA
## 7 33 <tibble [1 × 50]> <lm> 48 NA NA
## 8 40 <tibble [1 × 50]> <lm> 47 NA NA
## 9 48 <tibble [1 × 50]> <lm> 46 NA NA
## 10 52 <tibble [1 × 50]> <lm> 44 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_db_ext_9 %>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $ext_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 34 NA NA
## 2 14 <tibble [1 × 50]> <lm> 33 NA NA
## 3 19 <tibble [1 × 50]> <lm> 33 NA NA
## 4 25 <tibble [1 × 50]> <lm> 29 NA NA
## 5 28 <tibble [1 × 50]> <lm> 33 NA NA
## 6 30 <tibble [1 × 50]> <lm> 34 NA NA
## 7 33 <tibble [1 × 50]> <lm> 32 NA NA
## 8 40 <tibble [1 × 50]> <lm> 33 NA NA
## 9 48 <tibble [1 × 50]> <lm> 32 NA NA
## 10 52 <tibble [1 × 50]> <lm> 28 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_db_int_9 %>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $int_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 34 NA NA
## 2 14 <tibble [1 × 50]> <lm> 33 NA NA
## 3 19 <tibble [1 × 50]> <lm> 33 NA NA
## 4 25 <tibble [1 × 50]> <lm> 29 NA NA
## 5 28 <tibble [1 × 50]> <lm> 33 NA NA
## 6 30 <tibble [1 × 50]> <lm> 34 NA NA
## 7 33 <tibble [1 × 50]> <lm> 32 NA NA
## 8 40 <tibble [1 × 50]> <lm> 33 NA NA
## 9 48 <tibble [1 × 50]> <lm> 32 NA NA
## 10 52 <tibble [1 × 50]> <lm> 28 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion15_ext %>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $ext_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 2 NA NA
## 2 14 <tibble [1 × 50]> <lm> 0 NA NA
## 3 19 <tibble [1 × 50]> <lm> 0 NA NA
## 4 25 <tibble [1 × 50]> <lm> 1 NA NA
## 5 28 <tibble [1 × 50]> <lm> 1 NA NA
## 6 30 <tibble [1 × 50]> <lm> 0 NA NA
## 7 33 <tibble [1 × 50]> <lm> 2 NA NA
## 8 40 <tibble [1 × 50]> <lm> 2 NA NA
## 9 48 <tibble [1 × 50]> <lm> 1 NA NA
## 10 52 <tibble [1 × 50]> <lm> 20 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion15_int%>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $int_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 2 NA NA
## 2 14 <tibble [1 × 50]> <lm> 0 NA NA
## 3 19 <tibble [1 × 50]> <lm> 0 NA NA
## 4 25 <tibble [1 × 50]> <lm> 1 NA NA
## 5 28 <tibble [1 × 50]> <lm> 1 NA NA
## 6 30 <tibble [1 × 50]> <lm> 0 NA NA
## 7 33 <tibble [1 × 50]> <lm> 2 NA NA
## 8 40 <tibble [1 × 50]> <lm> 2 NA NA
## 9 48 <tibble [1 × 50]> <lm> 1 NA NA
## 10 52 <tibble [1 × 50]> <lm> 20 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion9_ext %>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $ext_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 2 NA NA
## 2 14 <tibble [1 × 50]> <lm> 2 NA NA
## 3 19 <tibble [1 × 50]> <lm> 2 NA NA
## 4 25 <tibble [1 × 50]> <lm> 1 NA NA
## 5 28 <tibble [1 × 50]> <lm> 1 NA NA
## 6 30 <tibble [1 × 50]> <lm> 2 NA NA
## 7 33 <tibble [1 × 50]> <lm> 2 NA NA
## 8 40 <tibble [1 × 50]> <lm> 2 NA NA
## 9 48 <tibble [1 × 50]> <lm> 2 NA NA
## 10 52 <tibble [1 × 50]> <lm> 1 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mod_expulsion9_int%>%
mutate(intercept = map_dfr(model, pull_coef))
## # A tibble: 675 × 4
## # Groups: idnum [675]
## idnum data model intercept$`(Intercept)` $int_scores $cm1bsex
## <dbl> <list> <list> <dbl> <dbl> <dbl>
## 1 8 <tibble [1 × 50]> <lm> 2 NA NA
## 2 14 <tibble [1 × 50]> <lm> 2 NA NA
## 3 19 <tibble [1 × 50]> <lm> 2 NA NA
## 4 25 <tibble [1 × 50]> <lm> 1 NA NA
## 5 28 <tibble [1 × 50]> <lm> 1 NA NA
## 6 30 <tibble [1 × 50]> <lm> 2 NA NA
## 7 33 <tibble [1 × 50]> <lm> 2 NA NA
## 8 40 <tibble [1 × 50]> <lm> 2 NA NA
## 9 48 <tibble [1 × 50]> <lm> 2 NA NA
## 10 52 <tibble [1 × 50]> <lm> 1 NA NA
## # … with 665 more rows, and 1 more variable: intercept$ck6ethrace <dbl>
mods <- function(data, x, y, points = FALSE, ...) {
p <- ggplot(data, aes({{x}}, {{y}}))
p + geom_smooth(method = "lm",
color = "magenta",
...) +
geom_smooth(...)
}
#AW: Do we want two fitted lines on the graphs? If so, it might be helpful to include a short note about the purpose of the two lines.
mods(ff_sub_lm, int_scores, del_beh_15) +
labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 15",
x = "Internalizing behavior score",
y = "Delinquency behaviors")
mods(ff_sub_lm, ext_scores, del_beh_15) +
labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 15",
x = "Externalizing behavior score",
y = "Delinquency behaviors")
mods(ff_sub_lm, int_scores, del_beh_9) +
labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 9",
x = "Internalizing behavior score",
y = "Delinquency behaviors")
mods(ff_sub_lm, ext_scores, del_beh_9) +
labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 9",
x = "Externalizing behavior score",
y = "Delinquency behaviors")
mods(ff_sub_lm, int_scores, p5l12g) +
labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 9",
x = "Internalizing behavior score",
y = "suspensions/expulsions")
#AW: p5l12g is a binary yes/no variable. It looks like the # of times the participant was suspended/expelled wasn't collected in Year 9, just a yes/no whether they were or were not.
mods(ff_sub_lm, ext_scores, p5l12g) +
labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 9",
x = "Externalizing behavior score",
y = "suspensions/expulsions")
#AW: p5l12g is a binary yes/no variable. It looks like the # of times the participant was suspended/expelled wasn't collected in Year 9, just a yes/no whether they were or were not.
mods(ff_sub_lm, int_scores, p6c22) +
labs(title = "Relationship between Internalizing year 9 x suspensions/expulsions in year 15",
x = "Internalizing behavior score",
y = "suspensions/expulsions")
mods(ff_sub_lm, ext_scores, p6c22) +
labs(title = "Relationship between Externalizing year 9 x suspensions/expulsions in year 15",
x = "Externalizing behavior score",
y = "suspensions/expulsions")
This section is still in progress
cm1bsex = gender of participant, as reported by mother during baseline data collection
ck6ethrace = race/ethnicity of participant, self-reported during Wave 6 / Year 15
p6c22 = Number of times youth has been suspended/expelled past two years as reported by primary caregiver in Wave 6/Year 15
# First step - recode race/eth
ethrace <- ff_sub_lm %>%
mutate(ck6ethrace = recode(ck6ethrace,
"1" = "White",
"2" = "Black",
"3" = "Hispanic/Latino",
"4" = "Other",
"5" = "Multiracial"))
# Beginning to play around with nest() %>% mutate. Here, I calculated the avg. suspensions/expulsions reported by primary caregiver for Year 15 for each race/eth category and generated a bar chart w/ displaying the mean suspensions/expulsion by race/ethnicity subgroup
by_ethrace <- ethrace %>%
group_by(ck6ethrace) %>%
nest() %>%
mutate(
avg_sus_exp = map_dbl(data, ~mean(.x$p6c22)),
)
by_ethrace %>%
ggplot(aes(y = avg_sus_exp, x = ck6ethrace)) +
geom_col()
I then used mutate() with the nested dataframe and map() to generate distributions for the following continuous variable by race/ethnicity:
These are definitely rough drafts. If we include these in our final product, my to-do list would include:
theme_minimal()I’d also like to replicate these for gender.
# Distributions of internalizing subscale scores at age 9 by race/ethnicity
by_ethrace <- by_ethrace %>%
mutate(
distributions_int_scores = map(
data, ~{
ggplot(.x, aes(int_scores)) +
geom_bar() +
labs(
title = "Distribution of Age 9 Internalizing Behavior Scores",
subtitle = glue("{ck6ethrace} Participants"),
x = "Internalizing Behavior Subscale Score",
y = ""
)
}
)
)
#by_ethrace$distributions_int_scores[[1]]
walk(by_ethrace$distributions_int_scores[1:5], print)
# Distributions of externalizing subscale scores at age 9 by race/ethnicity
by_ethrace <- by_ethrace %>%
mutate(
distributions_ext_scores = map(
data, ~{
ggplot(.x, aes(ext_scores)) +
geom_bar() +
labs(
title = "Distribution of Age 9 Externalizing Behavior Scores",
subtitle = glue("{ck6ethrace} Participants"),
x = "Externalizing Behavior Subscale Score",
y = ""
)
}
)
)
#by_ethrace$distributions_ext_scores[[1]]
walk(by_ethrace$distributions_ext_scores[1:5], print)
#Distributions of delinquent behavior subscale scores reported at age 15 by race/ethnicity subgroup
by_ethrace <- by_ethrace %>%
mutate(
distributions_del_beh_15 = map(
data, ~{
ggplot(.x, aes(del_beh_15_self_rep)) +
geom_bar() +
labs(
title = "Distribution of Age 15 Delinquent Behavior Scores",
subtitle = glue("{ck6ethrace} Participants"),
x = "Delinquent Behavior Total Score",
y = ""
)
}
)
)
#by_ethrace$distributions_del_beh_15[[1]]
walk(by_ethrace$distributions_del_beh_15[1:5], print)
#Distributions of reported suspensions and expulsions reported at age 15.
by_ethrace <- by_ethrace %>%
mutate(
distributions_susp_exp_15 = map(
data, ~{
ggplot(.x, aes(p6c22)) +
geom_bar() +
labs(
title = "Distribution of Suspensions & Explusions at Age 15",
subtitle = glue("{ck6ethrace} Participants"),
x = "Reported Suspensions and Explusions",
y = ""
)
}
)
)
#by_ethrace$distributions_susp_exp_15[[1]]
walk(by_ethrace$distributions_susp_exp_15[1:5], print)
ff_sub_lm1 <- ff_sub_lm %>%
select( int_scores, ext_scores, cm1bsex, ck6ethrace,
del_beh_9, del_beh_15, p5l12g, p6c22)
ff_sub_lm1$cm1bsex <- factor(ff_sub_lm1$cm1bsex, labels = c("male","female"))
ff_sub_lm1$ck6ethrace <- as.factor(ff_sub_lm1$ck6ethrace)
ff_sub_lm1$ck6ethrace <- recode (ff_sub_lm1$ck6ethrace, "1" = "white",
"2" = "black",
"3" = "hispanic",
"4" = "other",
"5" = "multi-racial")
#I tried to use the following code with *map* but could not make it work
#ff_sub_lm <- map(ff_sub_lm, ~{
# .x %>%
# mutate(cm1bsex = factor(cm1bsex, labels = c("male","female")),
# ck6ethrace = factor(ck6ethrace, labels = c("white",
# "black",
# "hispanic",
#"other",
# "multi-racial")))
#})
#Function for a scatterplot with a fitted regression line
scatter1 <- function(df, DV, IV, group) {
var1 <- deparse(substitute(DV))
var2 <- deparse(substitute(IV))
p = df %>%
ggplot() +
geom_point(aes(x = IV, y = DV), color = "gray50", stroke = 0, alpha = .6) +
geom_smooth(method = lm, se = FALSE,
aes(x = IV, y = DV, color = group)) +
scale_y_continuous(expand = c(0,0),
breaks = c(35, 40, 45, 50)) +
coord_cartesian(ylim = c(35, 55 )) +
theme_minimal(15) +
labs(x = print(var2),
y = print(var1)) +
theme(plot.title.position = "plot",
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank())
ifelse(var2 == "ff_sub_lm1$int_scores",
p <- p + labs(title = "Delinquent Behavior at 15 predicted by Internalizing scores at 9"),
ifelse (var2 == "ff_sub_lm1$ext_scores",
p <- p + labs(title = "Delinquent Behavior at 15 predicted by Externalizing scores at 9"),
p <- p))
p
}
#Del. Behavior at 15 by Internalizing scores at 9, grouped by Race
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$int_scores, ff_sub_lm1$ck6ethrace)
## [1] "ff_sub_lm1$int_scores"
## [1] "ff_sub_lm1$del_beh_15"
#Del. Behavior at 15 by Internalizing scores at 9, grouped by Gender
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$int_scores, ff_sub_lm1$cm1bsex)
## [1] "ff_sub_lm1$int_scores"
## [1] "ff_sub_lm1$del_beh_15"
#Del. Behavior at 15 by Externalizing scores at 9, grouped by Race
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$ext_scores, ff_sub_lm1$ck6ethrace)
## [1] "ff_sub_lm1$ext_scores"
## [1] "ff_sub_lm1$del_beh_15"
#Del. Behavior at 15 by Externalizing scores at 9, grouped by Gender
scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, ff_sub_lm1$ext_scores, ff_sub_lm1$cm1bsex)
## [1] "ff_sub_lm1$ext_scores"
## [1] "ff_sub_lm1$del_beh_15"
#Plots for both internalizing and externalizing behavior grouped by gender
map(ff_sub_lm1[1:2],
~{scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, .x, ff_sub_lm1$cm1bsex)})
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## $int_scores
##
## $ext_scores
#Plots for both internalizing and externalizing behavior grouped by gender
map(ff_sub_lm1[1:2],
~{scatter1(ff_sub_lm1, ff_sub_lm1$del_beh_15, .x, ff_sub_lm1$ck6ethrace)})
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## [1] ".x"
## [1] "ff_sub_lm1$del_beh_15"
## $int_scores
##
## $ext_scores
#Function for a scatterplot with a fitted regression line (Internalizing behavior * delinquency)
scatter2 <- function(df, group) {
p = df %>%
ggplot() +
geom_point(aes(x = int_scores, y = del_beh_15), color = "gray50",
stroke = 0, alpha = .6) +
geom_smooth(method = lm, se = FALSE,
aes(x = int_scores, y = del_beh_15, color = cm1bsex)) +
scale_y_continuous(expand = c(0,0),
breaks = c(35, 40, 45, 50)) +
coord_cartesian(ylim = c(35, 55 )) +
theme_minimal(15) +
scale_color_OkabeIto(name = "Gender") +
theme(plot.title.position = "plot",
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
title =element_text(size=8))
p + labs (title = paste("Delinquent Behavior at 15 predicted by Internalizing scores at 9", group, sep = ": "))
}
nest_df = ff_sub_lm1 %>%
group_by(ck6ethrace) %>%
nest()
plots2 <- map2(nest_df$data, nest_df$ck6ethrace,
~scatter2(.x, .y))
ggpubr::ggarrange(plots2[[1]], plots2[[2]], plots2[[3]], plots2[[4]], plots2[[5]],
ncol = 2, nrow = 3,
common.legend = TRUE,
legend = 'bottom')
#Function for a scatterplot with a fitted regression line (Externalizing behavior * delinquency)
scatter3 <- function(df, group) {
p = df %>%
ggplot() +
geom_point(aes(x = ext_scores, y = del_beh_15), color = "gray50", stroke = 0, alpha = .6) +
geom_smooth(method = lm, se = FALSE,
aes(x = ext_scores, y = del_beh_15, color = ck6ethrace)) +
scale_y_continuous(expand = c(0,0),
breaks = c(35, 40, 45, 50))+
coord_cartesian(ylim = c(35, 55 )) +
theme_minimal(15) +
scale_color_OkabeIto(name = "Race/Ethnicity") +
theme(plot.title.position = "plot",
panel.grid.minor.y = element_blank(),
panel.grid.minor.x = element_blank(),
title =element_text(size=8))
p + labs (title = paste("Delinquent Behavior at 15 predicted by Externalizing scores at 9", group, sep = ": "))
}
nest_df = ff_sub_lm1 %>%
group_by(cm1bsex) %>%
nest()
plots3 <- map2(nest_df$data, nest_df$cm1bsex,
~scatter3(.x, .y))
ggpubr::ggarrange(plots3[[1]], plots3[[2]],
ncol = 2, nrow = 1,
common.legend = TRUE,
legend = 'bottom')